package org.wisdomdata.segment;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.FileUtils;

/**
 * Segments the Sogou news corpus with an NLP word segmenter (HanLP / Ansj).
 * Each raw corpus file is read (GBK), its &lt;content&gt;...&lt;/content&gt; blocks are
 * extracted, split into sentences, word-segmented, and appended to a single
 * UTF-8 output file. Each source file is deleted after successful processing.
 *
 * @author clebeg	2015-04-17 14:46
 */
public class SogouCorpusSegmenter {
	// Root directory holding the raw Sogou corpus files (overridable via main()).
	private static String sogouCorpusPath = "F:/BaiduYunDownload/SogouCA";
	// Character encoding of the raw corpus files.
	private static String fileEncoding = "GBK";
	// Output file, relative to the corpus directory, collecting all segmented text.
	private static String segmentedFile = "segmented/sogou_segmented.txt";
	
	// Non-greedy match of the text between <content> and </content> tags
	// (group 2 is the body). Compiled once instead of on every call.
	private static final Pattern CONTENT_PATTERN = Pattern.compile("(<content>)(.*?)(</content>)");
	private static ClebegSegmenter segmenter = new AnsjSegmenter();

	/**
	 * Processes every regular file in the corpus directory: extracts the
	 * content blocks, segments them, and appends the result to the shared
	 * output file. Each source file is deleted once it has been written out.
	 *
	 * @throws IOException if the corpus directory is missing or a file cannot be read or written
	 */
	public void doSegmentTask() throws IOException {
		File corpus = new File(sogouCorpusPath);
		File segment = new File(corpus, segmentedFile);
		if (!segment.getParentFile().exists())
			segment.getParentFile().mkdirs();
		
		File[] listCorpus = corpus.listFiles();
		// listFiles() returns null when the path does not exist or is not a directory.
		if (listCorpus == null)
			throw new IOException("语料库目录不存在或不是目录: " + sogouCorpusPath);
		
		// Process all corpus files one by one.
		int count = listCorpus.length;
		for (int i = 0; i < count; i++) {
			File corpusFile = listCorpus[i];
			// Skip directories — notably the freshly created "segmented" output
			// directory, which lives inside the corpus directory and would
			// otherwise make readFileToString throw an IOException.
			if (!corpusFile.isFile())
				continue;
			System.out.println((i + 1) + "/" + count +" 正在处理文件：" + corpusFile.getName());
			String contents = FileUtils.readFileToString(corpusFile, fileEncoding);
			List<String> ls = findMatchContents(contents, CONTENT_PATTERN);
			FileUtils.writeLines(segment, "UTF-8", ls, true);
			corpusFile.delete();
		}
		
	}
	
	/**
	 * Extracts every content body matched by {@code pattern} (group 2), splits
	 * each body into sentences, and runs the word segmenter over each sentence.
	 *
	 * @param content raw file text to scan
	 * @param pattern compiled pattern whose capture group 2 is the content body
	 * @return segmented sentences, one list entry per sentence (possibly empty)
	 */
	private List<String> findMatchContents(String content, Pattern pattern) {
		List<String> contents = new ArrayList<String>();
		
		Matcher m = pattern.matcher(content);
		while (m.find()) {
			// Note: the old groupCount() > 2 guard was dead code — groupCount()
			// is a constant property of the pattern, not a per-match count.
			String tmp = m.group(2).trim();
			if (tmp.isEmpty())
				continue;
			
			List<String> sentences = segString2Sentence(tmp);
			for (String sentence : sentences) {
				contents.add(segmenter.segment(sentence));
			}
		}
		return contents;
	}
	
	/**
	 * Splits a text block into sentences at common Chinese/ASCII sentence
	 * delimiters (。 . ！ ： ； ？). Delimiters are dropped and blank pieces are
	 * skipped. Any trailing text after the last delimiter is discarded,
	 * matching the original behavior of this method.
	 *
	 * @param content text to split into sentences
	 * @return list of non-empty, trimmed sentences
	 */
	public List<String> segString2Sentence(String content) {
		List<String> sentences = new ArrayList<String>();
		int len = content.length();
		int begin = 0;
		for (int i = 0; i < len; i++) {
			char c = content.charAt(i);
			if (c == '。' || c == '.' || c == '！' || c == '：' || c == '；' || c == '？') {
				String sentence = content.substring(begin, i).trim();
				// substring never returns null, so only the empty check is needed.
				if (!sentence.isEmpty())
					sentences.add(sentence);
				begin = i + 1;
			}
		}
		return sentences;
	}
	
	/**
	 * Entry point. Expects the corpus directory as the first argument; without
	 * it, prints usage to stderr and exits with status 1.
	 */
	public static void main(String[] args) {
		SogouCorpusSegmenter segmenter = new SogouCorpusSegmenter();
		try {
			if (args.length == 0) {
				System.err.println("必须给出非分词语料库的存放目录，并且全部语料文件放到此文件夹下面。");
				System.exit(1);
			} else {
				sogouCorpusPath = args[0];
				System.out.println("程序将自动对 " + args[0] + " 下面的所有sogou语料文件分词...");
				System.out.println("并且在此目录下面新建 segmented目录，所有分词结果保存到下面的 sogou_segmented.txt 中。");
			}
			segmenter.doSegmentTask();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}
